#  topic 23  Paired values
#
#  Create population
source("../gnrnd5.R")
source("../gnrnd4.R")

gnrnd5(57214499910, 31000011000075) # Generates L1 and L2

# These values represent paired measures on a single
# population.  Generally, this is a "before" and "after"
# measure of some value, i.e., a measure before some
# treatment and a measure after the treatment. 
# For example, perhaps we want to look at how eating 
# flaxseed affects a person's HDL measure. [HDL is
# High Density Lipoprotien, the "good cholesterol] We could
# take HDL measures on people, then have them eat 
# 3 tablespoons of flaxseed each day for a month, then
# take a new measure of their HDL. Then we have a pair
# of measures on those people, a "before" and an "after"
# measure.

# What we have done here is to create such a population
# of measures.
HDL_before <- L1
HDL_after <- L2
          #########################################
          ##  It is somewhat silly to do the    ###
          ##  following if we have all the data,###
          ##  but here we want to see how we    ###
          ##  can get a confidence interval for ###
          ##  the change in HDL level and/or do ###
          ##  a hypothesis test on H0: no change###
          ##  versus H1: HDL went up.           ###
          #########################################

# To generate a confidence interval for the change in
# HDL levels we want to take a sample of the population.

# generate the index values for a sample of size 31
gnrnd4( 885033001, 500000001)
L1
pre_HDL <- HDL_before[ L1 ]
post_HDL <- HDL_after[ L1 ]

#  look at the scores
pre_HDL
post_HDL

#  but these are paired values.  We can find how much of
#  an increase each of these 31 participants showed
HDL_increase <- post_HDL - pre_HDL
HDL_increase

#  Now we want to get a 95% confidence interval for that
#  increase.  But the increase is just a single value
#  for each participant.  We are really back in the 
#  case of getting a confidence interval for the 
#  population mean based on a sample where we do not 
#  know the population standard deviation.

source( "../ci_unknown.R")
ci_unknown( sd( HDL_increase), length(HDL_increase),
            mean(HDL_increase), 0.95 )

###################################
##  Now, since we actually have the measures for the 
##  entire population, let us get the true mean of 
##  the increase.
###################################
pop_increase <- HDL_after - HDL_before
mu <- mean( pop_increase )
mu 
#  our sample generated a 95% confidence interval that
#  did not contain the true mean!  What went wrong?
#  Nothing, it just turned out that this was one
#  of the 5% generated confidence intervals that do
#  not contain the true mean.

#  To show this, let us generate 10000 such 
#  samples and get from them the 10000 confidence
#  intervals.  Then, see how many of those
#  contain the true mean increase

L1 <- 1:10000
for ( i in 1:10000 ) {   
  this_samp <- sample( pop_increase, 31 )
  this_interval <- ci_unknown( sd( this_samp), 31,
                               mean(this_samp), 0.95)
  if( as.numeric(this_interval[1]) < mu  &
      mu < as.numeric( this_interval[2]) ) {
         L1[i] <- "contains" }
     else {
         L1[i] <- "misses" }
  }
 
table( L1 )  # should give around 5% misses

#########################################
##     Hypotheses test                 ##
#########################################

# In this class the only null hypothesis that
# we will use for this type of problem is that
# the mean difference in the values is 0.  But,
# for us, in this situation, that is H0: mu=0.
# The alternative could be any of the usual
# three suspects:  >, < !=

# We will use H1: mu>0

# But, again, this is just a hypothesis test for the
# mean based on a sample where we do not know the 
# population standard deviations.  That was the case
# when we would use hypoth_test_unknown().

# We will do this at the 0.05 level of significance.
# Recall that our sample is still in HDL_increase
HDL_increase
source("../hypo_unknown.R")
hypoth_test_unknown( 0, 1, 0.05, 31, 
                     mean( HDL_increase),
                     sd(HDL_increase))
# this resounding rejection is not a shock since it
# was this sample that gave us the confidence interval
# that did not contain the true mean.

# We could run this same test with 10000 samples to see
# how often we would reject H0 based on those samples

L1 <- 1:10000
for( i in 1:10000 ) {

  this_samp <- sample( pop_increase, 31 )
  this_test <- hypoth_test_unknown( 0, 1, 0.05,
                       31,   mean(this_samp),
                       sd( this_samp))
  L1[i] <- this_test[14]
}

table( L1 )
#  And, again, we should not expect to have a 85% rejection
#  because the true mean  difference is 2.0422, not 0.

#  Let us start with a new population, but this one where 
#  there is little change, still a change, but not much of one

gnrnd5(77344499910, 3000011000075) # Generates L1 and L2
HDL_before <- L1
HDL_after <- L2
pop_increase <- HDL_after - HDL_before
mean( pop_increase )

#   now try the demo of 10000 samples
L1 <- 1:10000
for( i in 1:10000 ) {

  this_samp <- sample( pop_increase, 31 )
  this_test <- hypoth_test_unknown( 0, 1, 0.05,
                                    31,   mean(this_samp),
                                    sd( this_samp))
  L1[i] <- this_test[14]
}

table( L1 )